library(tidyverse)
library(tidytext)
library(widyr)
library(ggraph)
library(igraph)
library(tidygraph)
library(plotly)
# Read the 2021 airline tweets from the project data directory.
# The path is built explicitly instead of calling setwd(), so loading the
# data does not change the working directory of the R session.
data_dir <- path.expand("~/dataviz2021/Group_Project/Andrew_Text")
a2021 <- read_csv(file.path(data_dir, "airline2021.csv"))
# a2015 <- read_csv(file.path(data_dir, "airlinetweets2015.csv"))
# Narrow the raw tweet table to the columns referenced further down.
df <- select(
  a2021,
  status_id, created_at, text,
  mentions_screen_name, location, retweet_count, hashtags
)
# Expand each tweet into one row per extracted (mention, hashtag) pair.
# Mentions and hashtags are pulled out of their string columns as the runs of
# ASCII letters that directly follow a double quote. Wherever one of the two
# columns is missing it is filled from the other, and the mention is
# lower-cased. The result stays grouped by status_id.
check <- df %>%
  mutate(
    mentions_screen_name =
      str_extract_all(mentions_screen_name, '(?<=")[A-Za-z]+')
  ) %>%
  unnest(cols = mentions_screen_name) %>%
  mutate(hashtags = str_extract_all(hashtags, '(?<=")[A-Za-z]+')) %>%
  unnest(cols = hashtags) %>%
  mutate(
    # Missing mention -> fall back to the hashtag.
    mentions_screen_name = coalesce(mentions_screen_name, hashtags),
    # Missing hashtag -> fall back to the (already filled) mention.
    hashtags = coalesce(hashtags, mentions_screen_name),
    mentions_screen_name = tolower(mentions_screen_name)
  ) %>%
  group_by(status_id)
# Tweets that reference exactly one airline handle, one row per tweet, with
# the handle mapped to a display name in `airline`.
clean <- check %>%
  filter(str_detect(mentions_screen_name,
                    'alaska|delta|united|southwest|americanair')) %>%
  group_by(status_id) %>%
  # `check` has one row per (mention, hashtag) pair, so a tweet that mentions
  # a single airline but carries several hashtags spans several rows. Counting
  # rows with n() would discard such tweets as "multi-airline"; count the
  # distinct handles instead.
  filter(n_distinct(mentions_screen_name) == 1) %>%
  # Collapse the repeated rows of each remaining tweet to a single row.
  slice_head(n = 1) %>%
  ungroup() %>%
  select(status_id, created_at, text,
         mentions_screen_name, location, retweet_count) %>%
  rename(airline = mentions_screen_name) %>%
  # NOTE(review): a handle that matches the pattern above but is not listed
  # here (no fallback branch) ends up with airline = NA -- confirm whether
  # those tweets should be dropped.
  mutate(airline = case_when(
    airline == "alaskaair" ~ "Alaska",
    airline %in% c("americanair", "americanairlines",
                   "americanairlnes") ~ "American",
    airline %in% c("delta", "deltaairline") ~ "Delta",
    airline == "southwestair" ~ "Southwest",
    airline == "united" ~ "United"
  ))
# Per-airline word frequencies: tokenise the tweet text, remove stop words
# (plus the Twitter-specific "rt" and "https"), scrub each token, and count
# the surviving words per airline, most frequent first.
tidy_2021 <- clean %>%
  mutate(text = tolower(text)) %>%
  unnest_tokens(word, text) %>%
  anti_join(
    bind_rows(
      stop_words,
      data.frame(word = c("rt", "https"), lexicon = "TWITTER")
    ),
    by = "word"
  ) %>%
  # The scrubbing steps run in order; each one sees the previous result.
  mutate(
    # punctuation and blanks
    word = gsub("[[:punct:][:blank:]]+", "", word),
    # digits
    word = gsub("[0-9]+", "", word),
    # words of one or two letters
    word = gsub("*\\b[[:alpha:]]{1,2}\\b *", "", word),
    # words made only of capital letters
    word = gsub("\\b[A-Z]+\\b", "", word),
    # leading, trailing and repeated spaces
    word = gsub("^ +| +$|( ) +", "\\1", word),
    # first occurrence of an airline name inside the token
    word = str_replace(
      word, "alaska|delta|united|southwest|americanair", ""
    )
  ) %>%
  filter(word != "") %>%
  count(airline, word, sort = TRUE)
# Total number of counted word occurrences per airline.
# tidy_2021 already holds the per-airline word counts (`n`) produced by the
# same tokenise/scrub/count steps, so the totals are summed from it instead
# of repeating that whole pipeline on `clean` a second time.
total_2021 <- tidy_2021 %>%
  group_by(airline) %>%
  summarize(total = sum(n))
# Attach each airline's overall word total to its per-word rows.
tidy_2021 <- tidy_2021 %>%
  left_join(total_2021, by = "airline")
# Keep only the words that have an AFINN sentiment score.
valence <- tidy_2021 %>%
  inner_join(get_sentiments("afinn"), by = "word")

# Violin plot with an overlaid narrow box plot of the AFINN scores found for
# each airline; the legend is suppressed because the x axis names the airline.
violin_plot <- valence %>%
  ggplot(aes(x = airline, y = value, color = airline)) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(breaks = -5:5) +
  labs(
    title = "Tweets Sentiment Value Distribution By Airlines",
    x = "Airlines",
    y = "AFINN Values"
  ) +
  theme(
    plot.title = element_text(vjust = 2, hjust = 0.5),
    legend.position = "none"
  )

violin_plot

# Faceted bar chart, one panel per airline, of the five most frequent scored
# words. Bar length is Contribution = AFINN value * number of appearances;
# fill separates positive from non-positive contributions. Freq and Polarity
# are mapped to `label`/`label1` so they can be shown in interactive tooltips.
weight_plot <- valence %>%
  mutate(Contribution = n * value) %>%
  rename(Freq = n, Polarity = value) %>%
  group_by(airline) %>%
  # Select the top five by frequency explicitly instead of depending on the
  # row order inherited from earlier steps.
  slice_max(Freq, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  ggplot(aes(
    x = Contribution,
    # reorder() ranks a word once across all airlines, so a word shared by
    # several panels can sit out of order in some of them. reorder_within()
    # orders the bars separately inside each airline's panel.
    y = reorder_within(word, Contribution, airline),
    fill = Contribution > 0, label = Freq, label1 = Polarity
  )) +
  geom_col(show.legend = FALSE) +
  # Strips the per-panel suffix that reorder_within() adds to the labels.
  scale_y_reordered() +
  facet_wrap(~airline, ncol = 2, scales = "free") +
  labs(x = "Sentiment Value * Number of Appearances",
       y = 'Top 5 Words From Tweets') +
  ggtitle("Sentiment Value Weighted by Frequency of Words in Tweets") +
  theme(plot.title = element_text(vjust = 2, hjust = 0.5),
        axis.title.x = element_text(vjust = -5),
        axis.title.y = element_text(vjust = -5),
        legend.position = 'none')

weight_plot

# Interactive (plotly) version of weight_plot.
# The tooltip entries must name aesthetics of the plot. "contribution" matches
# neither an aesthetic nor the `Contribution` column (names are
# case-sensitive), so the bar value is requested through the aesthetic it is
# mapped to, `x`, alongside the `label` (Freq) and `label1` (Polarity)
# aesthetics.
weight_plot_i <-
  ggplotly(weight_plot, tooltip = c("x", "label", "label1")) %>%
  layout(autosize = FALSE)

weight_plot_i
LS0tCnRpdGxlOiAiRmluYWwgR3JhcGhzIgphdXRob3I6ICJBbmRyZXcgTGFpIgpkYXRlOiAiMDgvMDQvMjAyMSIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICB0b2M6IHllcwogICAgdGhlbWU6IHNwYWNlbGFiCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSkKYGBgCgpgYGB7ciwgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkodGlkeXRleHQpCmxpYnJhcnkod2lkeXIpCmxpYnJhcnkoZ2dyYXBoKQpsaWJyYXJ5KGlncmFwaCkKbGlicmFyeSh0aWR5Z3JhcGgpCmxpYnJhcnkocGxvdGx5KQpgYGAKCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQpzZXR3ZCgifi9kYXRhdml6MjAyMS9Hcm91cF9Qcm9qZWN0L0FuZHJld19UZXh0IikKYTIwMjEgPC0gcmVhZF9jc3YoImFpcmxpbmUyMDIxLmNzdiIpCiNhMjAxNSA8LSByZWFkX2NzdigiYWlybGluZXR3ZWV0czIwMTUuY3N2IikKYGBgCgpgYGB7cn0KZGYgPC0gYTIwMjEgJT4lCiAgc2VsZWN0KHN0YXR1c19pZCxjcmVhdGVkX2F0LHRleHQsCiAgICAgICAgIG1lbnRpb25zX3NjcmVlbl9uYW1lLGxvY2F0aW9uLHJldHdlZXRfY291bnQsIGhhc2h0YWdzKQpgYGAKCmBgYHtyfQpjaGVjayA8LSBkZiAlPiUKICBtdXRhdGUobWVudGlvbnNfc2NyZWVuX25hbWUgPSBzdHJfZXh0cmFjdF9hbGwobWVudGlvbnNfc2NyZWVuX25hbWUsIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnKD88PSIpW0EtWmEtel0rJykpICU+JQogICB1bm5lc3QoLiwgbWVudGlvbnNfc2NyZWVuX25hbWUpICU+JQogIG11dGF0ZShoYXNodGFncyA9IHN0cl9leHRyYWN0X2FsbChoYXNodGFncywgJyg/PD0iKVtBLVphLXpdKycpKSAlPiUKICAgdW5uZXN0KC4sIGhhc2h0YWdzKSAlPiUKICBncm91cF9ieShzdGF0dXNfaWQpICU+JQogIG11dGF0ZShtZW50aW9uc19zY3JlZW5fbmFtZSA9CiAgICAgICAgICAgaWZlbHNlKGlzLm5hKG1lbnRpb25zX3NjcmVlbl9uYW1lKSxoYXNodGFncyxtZW50aW9uc19zY3JlZW5fbmFtZSkpICU+JQogIG11dGF0ZShoYXNodGFncyA9IAogICAgICAgICAgIGlmZWxzZShpcy5uYShoYXNodGFncyksIG1lbnRpb25zX3NjcmVlbl9uYW1lLGhhc2h0YWdzKSkgJT4lCiAgbXV0YXRlKG1lbnRpb25zX3NjcmVlbl9uYW1lID0gdG9sb3dlcihtZW50aW9uc19zY3JlZW5fbmFtZSkpCmBgYAoKCmBgYHtyfQojIFR3ZWV0cyB0aGF0IG9ubHkgcmVmZXJlbmNlIG9uIGFpcmxpbmUKY2xlYW4gPC0gY2hlY2sgJT4lCiAgZmlsdGVyKHN0cl9kZXRlY3QobWVudGlvbnNfc2NyZWVuX25hbWUsIAogICAgICAgICAgICAgICAgICAgICdhbGFza2F8ZGVsdGF8dW5pdGVkfHNvdXRod2VzdHxhbWVyaWNhbmFpcicpKSAlPiUKICBncm91cF9ieShzdGF0dXNfaWQpICU+JQogIG11dGF0ZShuID0gbigpLAogICAgICAgICBpc19kdXBlID0gaWZlbHNlKG4gPiAx
LDEsMCkpICU+JQogIGZpbHRlcihpc19kdXBlID09IDApICU+JQogIGZpbHRlcihyb3dfbnVtYmVyKCk9PTEpICU+JQogIHNlbGVjdChzdGF0dXNfaWQsY3JlYXRlZF9hdCx0ZXh0LAogICAgICAgICBtZW50aW9uc19zY3JlZW5fbmFtZSxsb2NhdGlvbixyZXR3ZWV0X2NvdW50KSAlPiUKICByZW5hbWUoYWlybGluZSA9IG1lbnRpb25zX3NjcmVlbl9uYW1lKSAlPiUKICBtdXRhdGUoYWlybGluZSA9IGNhc2Vfd2hlbihhaXJsaW5lID09ICJhbGFza2FhaXIiIH4gIkFsYXNrYSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYWlybGluZSAlaW4lIGMoImFtZXJpY2FuYWlyIiwiYW1lcmljYW5haXJsaW5lcyIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgImFtZXJpY2FuYWlybG5lcyIpIH4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJBbWVyaWNhbiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYWlybGluZSAlaW4lIGMoImRlbHRhIiwgImRlbHRhYWlybGluZSIpIH4gIkRlbHRhIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBhaXJsaW5lID09ICJzb3V0aHdlc3RhaXIiIH4gIlNvdXRod2VzdCIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYWlybGluZSA9PSAidW5pdGVkIiB+ICJVbml0ZWQiKSkgJT4lCiAgdW5ncm91cCgpCmBgYAoKYGBge3J9CnRpZHlfMjAyMSA8LSBjbGVhbiAlPiUKICBtdXRhdGUodGV4dCA9IHRvbG93ZXIodGV4dCkpICU+JQogIHVubmVzdF90b2tlbnMob3V0cHV0ID0gd29yZCwgaW5wdXQgPSB0ZXh0KSAlPiUgCiAgYW50aV9qb2luKGJpbmRfcm93cyhzdG9wX3dvcmRzLCBkYXRhLmZyYW1lKHdvcmQgPSBjKCJydCIsICJodHRwcyIpLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgbGV4aWNvbiA9ICJUV0lUVEVSIikpLCAKICAgICAgICAgICAgYnkgPSAid29yZCIpICU+JQogIG11dGF0ZSh3b3JkID0gIGdzdWIoIltbOnB1bmN0Ol1bOmJsYW5rOl1dKyIsICIiLCB3b3JkKSkgJT4lCiAgbXV0YXRlKHdvcmQgPSBnc3ViKCJbMC05XSsiLCAiIiwgd29yZCkpICU+JQogIG11dGF0ZSh3b3JkID0gIGdzdWIoIipcXGJbWzphbHBoYTpdXXsxLDJ9XFxiICoiLCAiIiwgd29yZCkpICU+JQogIG11dGF0ZSh3b3JkID0gIGdzdWIoIlxcYltBLVpdK1xcYiIsICIiLCB3b3JkKSkgJT4lCiAgbXV0YXRlKHdvcmQgPSBnc3ViKCJeICt8ICskfCggKSArIiwgIlxcMSIsIHdvcmQpKSAlPiUKICBtdXRhdGUod29yZCA9IHN0cl9yZXBsYWNlKHdvcmQsImFsYXNrYXxkZWx0YXx1bml0ZWR8c291dGh3ZXN0fGFtZXJpY2FuYWlyIiwiIikpICU+JQogIAogIGZpbHRlcih3b3JkICE9ICIiKSAlPiUKICBjb3VudChhaXJsaW5lLCB3b3JkLCBzb3J0ID0gVFJVRSkgCmBgYAoKYGBge3J9CnRvdGFsXzIwMjEgPC0gY2xlYW4gJT4lCiAgbXV0YXRlKHRleHQgPSB0b2xvd2VyKHRleHQpKSAlPiUKICB1bm5lc3RfdG9rZW5zKG91dHB1
dCA9IHdvcmQsIGlucHV0ID0gdGV4dCkgJT4lIAogIGFudGlfam9pbihiaW5kX3Jvd3Moc3RvcF93b3JkcywgZGF0YS5mcmFtZSh3b3JkID0gYygicnQiLCAiaHR0cHMiKSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGxleGljb24gPSAiVFdJVFRFUiIpKSwgCiAgICAgICAgICAgIGJ5ID0gIndvcmQiKSAlPiUKICBtdXRhdGUod29yZCA9ICBnc3ViKCJbWzpwdW5jdDpdWzpibGFuazpdXSsiLCAiIiwgd29yZCkpICU+JQogIG11dGF0ZSh3b3JkID0gZ3N1YigiWzAtOV0rIiwgIiIsIHdvcmQpKSAlPiUKICBtdXRhdGUod29yZCA9ICBnc3ViKCIqXFxiW1s6YWxwaGE6XV17MSwyfVxcYiAqIiwgIiIsIHdvcmQpKSAlPiUKICBtdXRhdGUod29yZCA9ICBnc3ViKCJcXGJbQS1aXStcXGIiLCAiIiwgd29yZCkpICU+JQogIG11dGF0ZSh3b3JkID0gZ3N1YigiXiArfCArJHwoICkgKyIsICJcXDEiLCB3b3JkKSkgJT4lCiAgbXV0YXRlKHdvcmQgPSBzdHJfcmVwbGFjZSh3b3JkLCJhbGFza2F8ZGVsdGF8dW5pdGVkfHNvdXRod2VzdHxhbWVyaWNhbmFpciIsIiIpKSAlPiUKICAKICBmaWx0ZXIod29yZCAhPSAiIikgJT4lCiAgY291bnQoYWlybGluZSwgd29yZCwgc29ydCA9IFRSVUUpICU+JQogIGdyb3VwX2J5KGFpcmxpbmUpICU+JSAKICBzdW1tYXJpemUodG90YWwgPSBzdW0obikpCmBgYAoKYGBge3J9CnRpZHlfMjAyMSA8LSBsZWZ0X2pvaW4odGlkeV8yMDIxLCB0b3RhbF8yMDIxLCBieSA9ICdhaXJsaW5lJykKYGBgCgoKYGBge3J9CnZhbGVuY2UgPC0gaW5uZXJfam9pbih0aWR5XzIwMjEsIGdldF9zZW50aW1lbnRzKCJhZmlubiIpLCBieSA9ICJ3b3JkIikKdmlvbGluX3Bsb3QgPC0gZ2dwbG90KHZhbGVuY2UsIGFlcyh4ID0gYWlybGluZSwgeSA9IHZhbHVlLCBjb2xvciA9IGFpcmxpbmUpKSArIAogIGdlb21fdmlvbGluKCBzaG93LmxlZ2VuZCA9IEZBTFNFKSArIAogIGdlb21fYm94cGxvdCh3aWR0aD0uMSkgKwogIHNjYWxlX3lfY29udGludW91cyhicmVha3MgPSBzZXEoLTUsIDUsIGJ5ID0gMSkpICsKICBsYWJzKHggPSAiQWlybGluZXMiLCB5ID0gIkFGSU5OIFZhbHVlcyIpICsKICBnZ3RpdGxlKCJUd2VldHMgU2VudGltZW50IFZhbHVlIERpc3RyaWJ1dGlvbiBCeSBBaXJsaW5lcyIpICsKICB0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KHZqdXN0PTIsIGhqdXN0ID0gMC41KSwKICAgICAgICBsZWdlbmQucG9zaXRpb24gPSAgJ25vbmUnKQoKdmlvbGluX3Bsb3QKYGBgCgoKYGBge3J9CndlaWdodF9wbG90IDwtIHZhbGVuY2UgJT4lCiAgbXV0YXRlKENvbnRyaWJ1dGlvbiA9IG4gKiB2YWx1ZSkgJT4lCiAgcmVuYW1lKEZyZXEgPSBuKSAlPiUKICByZW5hbWUoUG9sYXJpdHkgPSB2YWx1ZSkgJT4lCiAgZ3JvdXBfYnkoYWlybGluZSkgJT4lCiAgc2xpY2VfaGVhZChuID0gNSkgJT4lCiAgYXJyYW5nZSgoKENvbnRyaWJ1dGlvbikpKSAlPiUKICBtdXRhdGUod29yZCA9IHJlb3JkZXIod29yZCwg
Q29udHJpYnV0aW9uKSkgJT4lCiAgZ2dwbG90KGFlcyh4ID0gQ29udHJpYnV0aW9uLCB5ID0gcmVvcmRlcih3b3JkLCBDb250cmlidXRpb24pLCAKICAgICAgICAgICAgIGZpbGwgPSBDb250cmlidXRpb24gPiAwLCBsYWJlbCA9IEZyZXEsIGxhYmVsMSA9IFBvbGFyaXR5KSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBmYWNldF93cmFwKH5haXJsaW5lLCBuY29sID0gMiwgc2NhbGVzID0gImZyZWUiKSArCiAgbGFicyh4ID0gIlNlbnRpbWVudCBWYWx1ZSAqIE51bWJlciBvZiBBcHBlYXJhbmNlcyIsCiAgICAgICB5ID0gJ1RvcCA1IFdvcmRzIEZyb20gVHdlZXRzJykgKwogIGdndGl0bGUoIlNlbnRpbWVudCBWYWx1ZSBXZWlnaHRlZCBieSBGcmVxdWVuY3kgb2YgV29yZHMgaW4gVHdlZXRzIikgKwogIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQodmp1c3Q9MiwgaGp1c3QgPSAwLjUpLAogICAgICAgIGF4aXMudGl0bGUueCA9IGVsZW1lbnRfdGV4dCh2anVzdCA9IC01KSwKICAgICAgICBheGlzLnRpdGxlLnkgPSBlbGVtZW50X3RleHQodmp1c3QgPSAtNSksCiAgICAgICAgbGVnZW5kLnBvc2l0aW9uID0gICdub25lJykKCndlaWdodF9wbG90CmBgYAoKYGBge3J9CndlaWdodF9wbG90X2kgPC0gCiAgZ2dwbG90bHkod2VpZ2h0X3Bsb3QsIHRvb2x0aXAgPSBjKCJjb250cmlidXRpb24iLCJsYWJlbCIsICJsYWJlbDEiKSkgJT4lIAogIGxheW91dChhdXRvc2l6ZSA9IEYpCgp3ZWlnaHRfcGxvdF9pIApgYGA=